This notebook explores using CopyKAT
to estimate tumor and normal cells in SCPCS000492 from SCPCP000015.
CopyKAT was run using the run-copykat.R
script with and without a normal reference. These results are read into
this notebook and used to:
CopyKAT to annotate tumor cells.CopyKAT to cell type
annotations from SingleR and CellAssign.suppressPackageStartupMessages({
# load required packages
library(SingleCellExperiment)
library(ggplot2)
library(copykat)
})
## Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by
## 'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment'
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current")
sample_dir <- file.path(data_dir, "SCPCP000015", params$sample_id)
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-ewings")
# source in helper functions for make_jaccard_matrix() and jaccard()
jaccard_functions <- file.path(module_base, "scripts", "utils", "jaccard-functions.R")
source(jaccard_functions)
# Input files
sce_filename <- glue::glue("{params$library_id}_processed.rds")
sce_file <- file.path(sample_dir, sce_filename)
obj_file <- glue::glue("{params$library_id}_final-copykat.rds")
png_file <- glue::glue("{params$library_id}_copykat_heatmap.jpeg")
copykat_objs <- c(
no_ref = file.path(params$no_ref_copykat_results, obj_file),
with_ref = file.path(params$with_ref_copykat_results, obj_file)
) |>
purrr::map(readr::read_rds)
png_list <- list(
no_ref = file.path(params$no_ref_copykat_results, png_file),
with_ref = file.path(params$with_ref_copykat_results, png_file)
)
# output classifications file
copykat_results_file <- file.path(params$results_dir, glue::glue("{params$library_id}_copykat-classifications.tsv"))
# read in processed sce
sce <- readr::read_rds(sce_file)
# read in tumor normal classifications
manual_classifications_df <- readr::read_tsv(params$marker_gene_classification)
## Rows: 7414 Columns: 2
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): barcodes, marker_gene_classification
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# check if marker gene annotations are present
if(all(is.na(manual_classifications_df$marker_gene_classification))){
has_marker_gene <- FALSE
} else {
has_marker_gene <- TRUE
}
# read in ref cells
normal_cells <- readr::read_tsv(params$reference_cell_file) |>
dplyr::filter(reference_cell_class == "Normal") |>
dplyr::pull(barcodes)
## Rows: 3887 Columns: 4
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): barcodes, reference_cell_class, singler_celltype_annotation, cellas...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read in ck predictions from both reference types (no_normal and with_normal)
ck_results_df <- copykat_objs |>
purrr::map(\(obj){
obj$prediction |>
as.data.frame()
}) |>
dplyr::bind_rows(.id = "reference_used")
# read in full gene by cell copy number detection results
full_ck_results_df <- copykat_objs |>
purrr::map(\(obj){
obj$CNAmat |>
as.data.frame()
}) |>
dplyr::bind_rows(.id = "reference_used")
Below we look at the heatmaps produced by CopyKAT.
9
Below we prepare and plot a UMAP that shows which cells are
classified as diploid, aneuploid, and not defined by
CopyKAT. We show a side by side UMAP with results from
running CopyKAT both with and without a reference of normal
cells.
umap_df <- sce |>
scuttle::makePerCellDF(use.dimred = "UMAP") |>
# replace UMAP.1 with UMAP1
dplyr::rename_with(
\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")
)
cnv_df <- umap_df |>
# first add manual annotations
dplyr::left_join(manual_classifications_df) |>
# now add copykat results
dplyr::left_join(ck_results_df, by = c("barcodes" = "cell.names")) |>
dplyr::mutate(
copykat.pred = dplyr::if_else(
# label any reference cells that were used
(barcodes %in% normal_cells) & (reference_used == "with_ref"),
"reference",
copykat.pred)
)
## Joining with `by = join_by(barcodes)`
ggplot(cnv_df, aes(x = UMAP1, y = UMAP2, color = copykat.pred)) +
geom_point(alpha = 0.5, size = 0.5) +
theme_bw() +
facet_wrap(vars(reference_used))
To validate some of these annotations, we can also look at some commonly found copy number variations found in Ewing sarcoma patients. There are a few known copy number variations in Ewing’s sarcoma:
Although these are the most frequent, there are patients who do not have any of these alterations and patients that only have some of these alterations. See Tirode et al., and Crompton et al..
CopyKAT outputs a matrix that contains the estimated
copy numbers for each gene in each cell. We can read that in and look at
the mean estimated copy numbers for each chromosome across each cell. We
might expect that tumor cells would show an increased estimated copy
number in Chr8, Chr12, and/or Chr 1 and a loss of Chr16.
# for every cell, calculate the mean detection level across all genes in a given chromosome
full_cnv_df <- full_ck_results_df |>
tidyr::pivot_longer(cols = -c(
reference_used,
chrom
),
names_to = "barcodes",
values_to = "cnv_detection") |>
dplyr::group_by(chrom, barcodes, reference_used) |>
dplyr::summarise(mean_cnv_detection = mean(cnv_detection))
## `summarise()` has grouped output by 'chrom', 'barcodes'. You can override using
## the `.groups` argument.
# join with cnv info
cnv_df <- cnv_df |>
dplyr::left_join(full_cnv_df, by = c("barcodes", "reference_used")) |>
dplyr::filter(!is.na(chrom))
Let’s look at the distribution of CNV estimation in cells that are
called aneuploid and diploid by CopyKAT.
# create faceted density plots showing estimation of CNV detection across each chr of interest
# colored by aneuploid/diploid estimation
ggplot(cnv_df, aes(x = mean_cnv_detection, color = copykat.pred)) +
geom_density() +
theme_bw() +
facet_grid(rows = vars(chrom),
cols = vars(reference_used))
Below we directly compare the annotations obtained using manual
classification of tumor and normal cells to annotating cells with
CopyKAT. To do this, we will calculate the confusion matrix
using caret::confusionMatrix().
filtered_cnv_df <- cnv_df |>
dplyr::filter(!(copykat.pred %in% c("not.defined", "reference")),
# filter any low confidence calls
!stringr::str_detect(copykat.pred, "low.conf"))
caret_df_list <- filtered_cnv_df |>
dplyr::mutate(copykat = ifelse(
# use str_detect for test data
copykat.pred == "diploid", "Normal", "Tumor"
)) |>
# make tumor the positive class
dplyr::mutate(
copykat = forcats::fct_relevel(copykat, "Tumor"),
marker_gene_classification = forcats::fct_relevel(marker_gene_classification, "Tumor")
) |>
split(cnv_df$reference_used)
## Warning in split.default(x = seq_len(nrow(x)), f = f, drop = drop, ...): data
## length is not a multiple of split variable
caret_df_list |>
purrr::imap(\(df, ref_type){
caret::confusionMatrix(
table(
df$marker_gene_classification,
df$copykat)
)
})
## $no_ref
## Confusion Matrix and Statistics
##
##
## Tumor Normal
## Tumor 44137 46322
## Normal 36501 31717
##
## Accuracy : 0.478
## 95% CI : (0.4756, 0.4805)
## No Information Rate : 0.5082
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0463
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5473
## Specificity : 0.4064
## Pos Pred Value : 0.4879
## Neg Pred Value : 0.4649
## Prevalence : 0.5082
## Detection Rate : 0.2782
## Detection Prevalence : 0.5701
## Balanced Accuracy : 0.4769
##
## 'Positive' Class : Tumor
##
##
## $with_ref
## Confusion Matrix and Statistics
##
##
## Tumor Normal
## Tumor 45701 44827
## Normal 38318 29808
##
## Accuracy : 0.4759
## 95% CI : (0.4735, 0.4784)
## No Information Rate : 0.5296
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.057
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5439
## Specificity : 0.3994
## Pos Pred Value : 0.5048
## Neg Pred Value : 0.4375
## Prevalence : 0.5296
## Detection Rate : 0.2881
## Detection Prevalence : 0.5706
## Balanced Accuracy : 0.4717
##
## 'Positive' Class : Tumor
##
We can also calculate the Jaccard similarity index to visualize the amount of cells that have overlapping annotations.
# calculate Jaccard similarity index for each reference type
jaccard_matrices <- caret_df_list |>
purrr::map(\(df) {
make_jaccard_matrix(
df,
"marker_gene_classification",
"copykat.pred"
)
})
# Set heatmap padding option
heatmap_padding <- 0.2
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(heatmap_padding, "in"))
# heatmaps comparing tumor/normal annotations manually vs. copyKAT
heatmap <- jaccard_matrices |>
purrr::imap(
\(jaccard_mtx, ref_type) {
ComplexHeatmap::Heatmap(
t(jaccard_mtx), # transpose because matrix rows are in common & we want a vertical arrangement
col = circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue")),
border = TRUE,
## Row parameters
cluster_rows = TRUE,
row_title = ref_type,
row_title_gp = grid::gpar(fontsize = 12),
row_title_side = "left",
row_names_side = "left",
row_dend_side = "right",
row_names_gp = grid::gpar(fontsize = 10),
## Column parameters
cluster_columns = FALSE,
column_title = "",
column_title_gp = grid::gpar(fontsize = 12),
column_names_side = "bottom",
column_names_gp = grid::gpar(fontsize = 10),
column_names_rot = 90,
## Legend parameters
heatmap_legend_param = list(
title = "Jaccard index",
direction = "vertical",
legend_width = unit(1.5, "in")
),
show_heatmap_legend = ref_type == "no_ref",
)
}) |>
# concatenate vertically into HeatmapList object
purrr::reduce(ComplexHeatmap::`%v%`) |>
ComplexHeatmap::draw(
heatmap_legend_side = "right",
# add a margin to the heatmap so labels don't get cut off
padding = unit(c(2, 20, 2, 2), "mm")
)
Lastly, we will compare the annotations from CopyKAT to
those obtained using SingleR and CellAssign by
calculating the Jaccard similarity index. For this comparison we will
use just the annotations from CopyKAT with no
reference.
celltype_columns <- c(
"singler_celltype_annotation",
"cellassign_celltype_annotation"
)
# filter to only get annotations from no ref
no_ref_only <- cnv_df |>
dplyr::filter(reference_used == "no_ref")
# create jaccard matrices for SingleR and CellAssign compared to aneuploid/diploid
jaccard_matrices <- celltype_columns |>
purrr::map(\(name) {
make_jaccard_matrix(
no_ref_only,
"copykat.pred",
name
)
}) |>
purrr::set_names("SingleR", "CellAssign")
# Set heatmap padding option
heatmap_padding <- 0.2
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(heatmap_padding, "in"))
# list of heatmaps looking at SingleR/ CellAssign vs tumor/normal
heatmap <- jaccard_matrices |>
purrr::imap(
\(celltype_mat, celltype_method) {
ComplexHeatmap::Heatmap(
t(celltype_mat), # transpose because matrix rows are in common & we want a vertical arrangement
col = circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue")),
border = TRUE,
## Row parameters
cluster_rows = TRUE,
row_title = celltype_method,
row_title_gp = grid::gpar(fontsize = 12),
row_title_side = "left",
row_names_side = "left",
row_dend_side = "right",
row_names_gp = grid::gpar(fontsize = 10),
## Column parameters
cluster_columns = FALSE,
column_title = "",
column_title_gp = grid::gpar(fontsize = 12),
column_names_side = "bottom",
column_names_gp = grid::gpar(fontsize = 10),
column_names_rot = 90,
## Legend parameters
heatmap_legend_param = list(
title = "Jaccard index",
direction = "vertical",
legend_width = unit(1.5, "in")
),
show_heatmap_legend = celltype_method == "SingleR",
)
}) |>
# concatenate vertically into HeatmapList object
purrr::reduce(ComplexHeatmap::`%v%`) |>
ComplexHeatmap::draw(
heatmap_legend_side = "right",
# add a margin to the heatmap so labels don't get cut off
padding = unit(c(2, 20, 2, 2), "mm")
)
celltype_df <- cnv_df |>
dplyr::select(barcodes,
reference_used,
copykat.pred) |>
dplyr::distinct() |>
tidyr::pivot_wider(
names_from = reference_used,
values_from = copykat.pred
)
readr::write_tsv(celltype_df, copykat_results_file)
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] copykat_1.1.0 ggplot2_3.5.1
## [3] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [5] Biobase_2.64.0 GenomicRanges_1.56.0
## [7] GenomeInfoDb_1.40.0 IRanges_2.38.0
## [9] S4Vectors_0.42.0 BiocGenerics_0.50.0
## [11] MatrixGenerics_1.16.0 matrixStats_1.3.0
##
## loaded via a namespace (and not attached):
## [1] pROC_1.18.5 rlang_1.1.3
## [3] magrittr_2.0.3 clue_0.3-65
## [5] GetoptLong_1.0.5 e1071_1.7-14
## [7] compiler_4.4.0 DelayedMatrixStats_1.26.0
## [9] png_0.1-8 reshape2_1.4.4
## [11] vctrs_0.6.5 stringr_1.5.1
## [13] shape_1.4.6.1 pkgconfig_2.0.3
## [15] crayon_1.5.2 fastmap_1.2.0
## [17] XVector_0.44.0 scuttle_1.14.0
## [19] labeling_0.4.3 utf8_1.2.4
## [21] rmarkdown_2.26 prodlim_2023.08.28
## [23] tzdb_0.4.0 UCSC.utils_1.0.0
## [25] purrr_1.0.2 bit_4.0.5
## [27] xfun_0.44 zlibbioc_1.50.0
## [29] cachem_1.0.8 beachmat_2.20.0
## [31] jsonlite_1.8.8 recipes_1.0.10
## [33] highr_0.10 DelayedArray_0.30.1
## [35] BiocParallel_1.38.0 cluster_2.1.6
## [37] parallel_4.4.0 R6_2.5.1
## [39] RColorBrewer_1.1-3 bslib_0.7.0
## [41] stringi_1.8.4 parallelly_1.37.1
## [43] rpart_4.1.23 lubridate_1.9.3
## [45] jquerylib_0.1.4 Rcpp_1.0.12
## [47] iterators_1.0.14 knitr_1.46
## [49] future.apply_1.11.2 readr_2.1.5
## [51] timechange_0.3.0 Matrix_1.7-0
## [53] splines_4.4.0 nnet_7.3-19
## [55] tidyselect_1.2.1 abind_1.4-5
## [57] yaml_2.3.8 timeDate_4032.109
## [59] doParallel_1.0.17 codetools_0.2-20
## [61] listenv_0.9.1 lattice_0.22-6
## [63] tibble_3.2.1 plyr_1.8.9
## [65] withr_3.0.0 evaluate_0.23
## [67] future_1.33.2 survival_3.5-8
## [69] proxy_0.4-27 circlize_0.4.16
## [71] pillar_1.9.0 BiocManager_1.30.23
## [73] renv_1.0.7 foreach_1.5.2
## [75] generics_0.1.3 vroom_1.6.5
## [77] rprojroot_2.0.4 hms_1.1.3
## [79] sparseMatrixStats_1.16.0 munsell_0.5.1
## [81] scales_1.3.0 globals_0.16.3
## [83] class_7.3-22 glue_1.7.0
## [85] tools_4.4.0 data.table_1.15.4
## [87] ModelMetrics_1.2.2.2 gower_1.0.1
## [89] forcats_1.0.0 grid_4.4.0
## [91] tidyr_1.3.1 ipred_0.9-14
## [93] colorspace_2.1-0 nlme_3.1-164
## [95] GenomeInfoDbData_1.2.12 cli_3.6.2
## [97] fansi_1.0.6 S4Arrays_1.4.0
## [99] ComplexHeatmap_2.20.0 lava_1.8.0
## [101] dplyr_1.1.4 gtable_0.3.5
## [103] sass_0.4.9 digest_0.6.35
## [105] caret_6.0-94 SparseArray_1.4.3
## [107] rjson_0.2.21 farver_2.1.2
## [109] htmltools_0.5.8.1 lifecycle_1.0.4
## [111] hardhat_1.3.1 httr_1.4.7
## [113] GlobalOptions_0.1.2 bit64_4.0.5
## [115] MASS_7.3-60.2